# Choose working location as jane or nemo
location <- "jane"
if (location == "jane") {
base_dir <- "/Volumes/lab-gandhis/home/users/wagena/periph_immune_neurodegen_publication/"
} else if (location == "nemo") {
base_dir <- here::here()
options(bitmapType='cairo') # Add graphics device for nemo
}
args <- list(PD_genes_file = str_c(base_dir, "/derived_data/Blauendraat_lancetneuro2020_table1_PDgenes.csv"),
AD_genes_file = str_c(base_dir, "/derived_data/Goate_neurobiodis2020_AD_genes_w_destrooper.csv"),
FTD_genes_file = str_c(base_dir, "/derived_data/Koch_ijms2023_FTD_genes_table1.csv"),
aquino_eqtl_dir_path = file.path(base_dir,
"raw_data",
"GWAS",
"aquino_2023_covid_eqtl"),
aquino_eqtl_summary_path = file.path(base_dir,
"raw_data",
"GWAS",
"aquino_2023_covid_eqtl/aquino_summary_table.csv"),
chen_2020_locus_to_gene_path = file.path(base_dir,
"raw_data",
"GWAS",
"chen_cell_2020",
"open_targets_locus_to_gene_neurodegen.csv"),
chen_2020_substudy_lookup_table_path = file.path(base_dir,
"raw_data",
"GWAS",
"chen_cell_2020",
"chen_cell_2020_study_subset_lookup_table.tsv"),
gwas_summary_numbers = file.path(base_dir, "raw_data/gwas_summary_numbers_supp_figure.csv"),
qtl_summary_numbers = file.path(base_dir, "raw_data/metadata_of_GWAS_QTL_included.csv"),
gtex_gene_expression_per_tissue_file = file.path(base_dir, "raw_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct")
)
# Set defaults for ggplots
theme_aw <- theme_bw(base_family = "Helvetica",
base_size = 14) +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
strip.background = element_blank(),
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.y = element_text(vjust = 0.6),
panel.spacing = unit(0.1, "lines"))
theme_set(theme_aw)
plot_labels = c("PD2019_ex23andMe" = "European PD GWAS",
"foo2020" = "East Asian PD GWAS",
"rizig2023" = "African PD GWAS",
"SNCA" = "SNCA",
"GBA" = "GBA",
"LRRK2" = "LRRK2",
"BIN1" = "BIN1",
"Monocyte_COV" = "Monocyte + SARS-CoV-2",
"Monocyte_NS" = "Monocyte",
"Dendritic-plasmacytoid_NS" = "Dendritic Plasmacytoid",
"Mono-cd14_NS" = "Monocyte CD14+",
"NS" = "-",
"COV" = "SARS-CoV-2",
"IAV" = "Influenza A",
"T.CD4" = "CD4 T",
"T.CD8" = "CD8 T",
"NK" = "NK",
"MYELOID" = "MYELOID",
"B" = "B",
"true_result" = "True result",
'all_gtex_genes' = "All genes",
"brain_gtex_genes" = "Brain genes")
plot_labeller = as_labeller(plot_labels)
my_palette <- c("Colocalised" = "darkslategrey",
"Suggestive" = "aquamarine3",
"Distinct" = "indianred3",
"Not_signif" = "grey70",
"PPH4" = "darkslategrey",
"PPH3" = "indianred3",
"Afr_As_Eur" = "indianred3",
"As" = "darkslategrey",
"Eur" = "aquamarine3",
"Afr" = "deepskyblue3",
"Afr_Eur" = "darkslateblue",
"Expressed" = "darkslategrey",
"Not Expressed" = "aquamarine3",
"WBC association" = "darkslategrey",
"Blood count association" = "darkslategrey",
"No association" = "aquamarine3",
"White blood cell count - negative β" = "slategrey",
"White blood cell count - positive β" = "darkslategrey",
"Monocyte count - negative β" = "aquamarine3",
"Monocyte count - positive β" = "aquamarine4",
"Lymphocyte counts - negative β" = "indianred3",
"Lymphocyte counts - positive β" = "indianred4",
"Neutrophil count - negative β" = "slateblue2",
"Neutrophil count - positive β" = "darkslateblue",
"Basophil count - negative β" = "slategrey",
"Basophil count - positive β" = "darkslategrey",
"Eosinophil counts - negative β" = "aquamarine3",
"Eosinophil counts - positive β" = "aquamarine4",
"Platelet count - negative β" = "indianred3",
"Platelet count - positive β" = "indianred4",
"Mean platelet volume - negative β" = "slateblue2",
"Mean platelet volume - positive β" = "darkslateblue",
"true_result" = "darkslategrey",
'all_gtex_genes' = "indianred4",
"brain_gtex_genes" = "indianred3")Aim: This analysis explores the expression of genes causally implicated in neurodegenerative diseases in peripheral immune cells.
The PD genes of interest come from Blaundraat Lancet Neurology 2020 table 1, using only the 14 genes that have high or very high confidence of being an actual PD gene (as defined by the authors).
PD_gene_df <- read_delim(args$PD_genes_file) %>%
dplyr::filter(Confidence_as_actual_PD_gene %in% c("Very high", "High")) %>%
dplyr::rename(Gene = `Empty Cell`) #%>%
# dplyr::mutate(Gene = dplyr::case_when(
# Gene == "GBA" ~ "GBA1", # Replace "GBA" with the proper name, "GBA1"
# ))
PD_genes <- tibble(disease = "PD",
gene = c(PD_gene_df$Gene, "RAB32")) # Remove KAT8 and CTSF as can't cherrypick here...
gt(PD_gene_df) %>%
opt_interactive()The AD genes of interest were derived from Alison Goates Genetic architecture of AD in Neurobiology of disease (2020). The genes selected were those highlighted in red, indicated increased confidence of causation.
AD_gene_df <- read_delim(args$AD_genes_file) %>%
dplyr::filter(Confident_goate == "Y") %>%
dplyr::select(-Confident_destrooper, -Reference)
AD_genes <- tibble(disease = "AD",
gene = AD_gene_df$`Reported gene`)
gt(AD_gene_df) %>%
opt_interactive(use_text_wrapping = T)The FTD genes were derived from Antonioni et al Frontotemporal Dementia, Where Do We Stand? A Narrative Review. It takes the causative genes listed in table 1, and filters for those that account for at least 1% of disease.
FTD_gene_df <- read_delim(args$FTD_genes_file) %>%
dplyr::filter(Frequency %in% c("10%", "5%", "1%"))
FTD_genes <- tibble(disease = "FTD",
gene = FTD_gene_df$Gene)
FTD_gene_df %>%
gt() %>%
opt_interactive(use_text_wrapping = T)summary_genes <- bind_rows(PD_genes, AD_genes, FTD_genes) %>%
dplyr::mutate(disease = factor(disease, levels = c("AD", "PD", "FTD"))) %>%
arrange(disease, gene)
summary_gene_list <- summary_genes$gene %>% unlist()
write_csv(summary_genes,
file = file.path(base_dir, "figures/summary_genes_table.csv"))
# # For thesis
# # Define each vector of genes
# AD_genes_vector <- AD_genes[["gene"]] %>% sort()
# PD_genes_vector <- PD_genes[["gene"]] %>% sort()
# FTD_genes_vector <- FTD_genes[["gene"]] %>% sort()
#
# # Find the maximum length among the vectors
# max_length <- max(length(AD_genes_vector), length(PD_genes_vector), length(FTD_genes_vector))
#
# # Pad each vector with NA to match the maximum length
# AD_genes_padded <- c(AD_genes_vector, rep(NA, max_length - length(AD_genes_vector)))
# PD_genes_padded <- c(PD_genes_vector, rep(NA, max_length - length(PD_genes_vector)))
# FTD_genes_padded <- c(FTD_genes_vector, rep(NA, max_length - length(FTD_genes_vector)))
#
# # Combine the padded vectors into a tibble
# summary_genes_for_thesis <- tibble(
# AD = AD_genes_padded,
# PD = PD_genes_padded,
# FTD = FTD_genes_padded
# )
#
# summary_genes_for_thesis %>%
# write_csv(file = file.path(base_dir, "figures/summary_genes_for_thesis.csv"))
#
# summary_genes_gt <- gt(summary_genes_for_thesis) %>%
# sub_missing(missing_text = "") %>%
# tab_style(
# style = cell_text(weight = "bold"),
# locations = cells_column_labels()
# )
#
# summary_genes_gt %>%
# gtsave(filename = file.path(base_dir, "figures/summary_genes_gt.png"))
summary_genes <- read_csv(file = file.path(base_dir, "figures/summary_genes_table.csv")) %>%
as_tibble()
summary_genes %>%
count(disease) %>%
dplyr::rename(`Number of genes` = n) %>%
gt()| disease | Number of genes |
|---|---|
| AD | 18 |
| FTD | 6 |
| PD | 15 |
4 different eQTL studies were included in the study. A multi-ancestry study (Aquino et al), an East Asian study (Ota et al), and 2 African/African admixed studies each covering a single peripheral immune cell: Nedelec et al exploring macrophages, and Quach et al exploring monocytes. All of these studies included peripheral immune cells that were stimulated, either in vitro using microbial or chemical stimuli, or, as in the case of Ota et al, in utilising human donors with a range of immune related diseases.
study_factor_order <- c("Aquino_2023", "Ota_2021", "Nedelec_2016" , "Quach_2016")
study_summary <- read_csv(args$qtl_summary_numbers)
eqtl_summary <- study_summary %>%
dplyr::filter(Study %in% c("Aquino_2023", "Ota_2021", "Nedelec_2016" , "Quach_2016")) %>%
dplyr::mutate(Study = fct_relevel(Study, study_factor_order))
# Transform Stimulation into a factor with levels for better control in plotting
eqtl_summary$Ethnicities <- factor(eqtl_summary$Ethnicities)
# Separate the metrics into individual plots with specific aesthetics
individuals_plot <- ggplot(eqtl_summary, aes(x = "N_individuals", y = fct_rev(Study))) +
geom_point(aes(size = N_individuals), colour = "darkslategrey") + # Circle shape
scale_size_continuous(range = c(1, 12), name = "N Individuals") +
theme_minimal() +
theme(axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines')) +
labs(x = NULL, y = NULL)
celltypes_plot <- ggplot(eqtl_summary, aes(x = "N_celltypes", y = fct_rev(Study))) +
geom_point(aes(size = N_celltypes), colour = "aquamarine3") + # Square shape
scale_size_continuous(range = c(1, 12), name = "N Celltypes") +
theme_minimal() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines')) +
labs(x = NULL)
ethnicity_plot <- ggplot(eqtl_summary, aes(x = "Ancestry", y = fct_rev(Study))) +
geom_point(aes(color = Ethnicities), shape = 15, size = 10) + # Same shape for all ethnicities
scale_color_manual(values = my_palette) +
labs(x = NULL, y = NULL, color = "Ethnicities") +
theme_minimal() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines'))
stimulation_plot <- ggplot(eqtl_summary, aes(x = "Stimulation", y = fct_rev(Study))) +
geom_point(aes(size = ifelse(is.na(Stimulation), NA, 10)), # Use conditional size
shape = 15) +
scale_size_continuous(range = c(0, 15), guide = "none") + # Set scale for size, remove legend
labs(x = NULL, y = NULL) +
theme_minimal() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines'))
# Combine the plots
eqtl_combined_studies_plot <- (individuals_plot | celltypes_plot | ethnicity_plot | stimulation_plot) +
plot_layout(guides = "collect") &
theme(legend.position = "bottom",
legend.direction = "vertical")
# Print the combined plot
eqtl_combined_studies_plotgwas_studies <- read_csv(args$gwas_summary_numbers) %>%
dplyr::mutate(Study = fct_relevel(Study, c("Foo_2020", "Nalls_2020", "Rizig_2023", "Jansen_2019", "Shigemizu_2021")))
# Transform Stimulation into a factor with levels for better control in plotting
gwas_studies$Ancestry <- factor(gwas_studies$Ancestry)
# Separate the metrics into individual plots with specific aesthetics
individuals_plot <- ggplot(gwas_studies, aes(x = "N_cases", y = fct_rev(Study))) +
geom_point(aes(size = N_cases), colour = "darkslategrey") + # Circle shape
scale_size_continuous(range = c(0.5, 4), name = "N cases") +
theme_minimal() +
theme(axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines')) +
labs(x = NULL, y = NULL)
celltypes_plot <- ggplot(gwas_studies, aes(x = "N_controls", y = fct_rev(Study))) +
geom_point(aes(size = N_controls), colour = "aquamarine3") + # Square shape
scale_size_continuous(range = c(1, 16), name = "N controls") +
theme_minimal() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines')) +
labs(x = NULL)
ethnicity_plot <- ggplot(gwas_studies, aes(x = "Ancestry", y = fct_rev(Study))) +
geom_point(aes(color = Ancestry), shape = 15, size = 10) + # Same shape for all ethnicities
scale_color_manual(values = my_palette) +
labs(x = NULL, y = NULL, color = "Ethnicities") +
theme_minimal() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
legend.key.size = unit(0.4, 'lines'))
# Combine the plots
gwas_combined_studies_plot <- (individuals_plot | celltypes_plot | ethnicity_plot) +
plot_layout(guides = "collect") &
theme(legend.position = "bottom",
legend.direction = "vertical")
# Print the combined plot
gwas_combined_studies_plotThe Aquino et al study was chosen as the baseline study for analysis given that it incorporated donors from multiple ancestries, multiple cell types and activation states.
This is a summary figure from the original paper, showing the study structure and cell types explored.
Data utilised in this analysis was derived from:
table_3d <- read_delim(str_c(args$aquino_eqtl_dir_path, "/aquino_supp_table_S3d.csv"),
skip = 3,
col_names = T)
# Update column names by combining first and second row
new_col_names <- paste(table_3d[1, ], table_3d[2, ], sep = "_")
names(table_3d) <- c("GeneID", "Symbol", new_col_names[-c(1,2)]) # Skip first two as they are GeneID and Symbol
# Remove the first two rows as they are now redundant
table_3d <- table_3d[-c(1, 2), ]
aquino_cell_order <- c("MYELOID",
"cDC",
"pDC",
"MONO CD14",
"MONO CD16",
"NK",
"NK CD56brt",
"NK CD56dim",
"NK MEM",
"ILC",
"T CD4",
"T CD4 E",
"T CD4 N",
"T CD8",
"T CD8 CM/EM",
"T CD8 EMRA",
"T CD8 N",
"Reg T",
"γδ T",
"MAIT",
"B",
"BMK",
"BML",
"BNK",
"BNL",
"Plasmablast")
# Convert the data to a long format
long_data <- table_3d %>%
pivot_longer(
cols = -c(GeneID, Symbol), # Excluding GeneID and Symbol columns from the pivot
names_to = c("Lineage", "virus"),
names_sep = "_", # Splitting based on the separator used in new column names
values_to = "log2TPM"
) %>%
dplyr::filter(Symbol %in% summary_gene_list) %>%
dplyr::left_join(.,
summary_genes,
by = c("Symbol"="gene")) %>%
dplyr::mutate(log2TPM = as.numeric(log2TPM),
virus = as_factor(virus) %>% fct_relevel(., c("NS", "IAV", "COV")),
Lineage = as_factor(Lineage) %>% fct_relevel(., aquino_cell_order)) %>%
group_by(Symbol)
genes_in_aquino <- tibble(gene = long_data$Symbol) %>%
dplyr::distinct(gene) %>%
dplyr::mutate(gene_in_aquino = "Expressed")
gene_expressed_in_data <- summary_genes %>%
dplyr::full_join(.,
genes_in_aquino,
by = c("gene")) %>%
dplyr::mutate(genes_in_aquino = case_when(gene_in_aquino == "Expressed" ~ "Expressed",
TRUE ~ "Not Expressed")) %>%
dplyr::select(-gene_in_aquino)
gene_expressed_in_data %>%
gt() %>%
opt_interactive()aquino_gene_count_data <- gene_expressed_in_data %>%
count(disease, genes_in_aquino)
aquino_gene_count <- aquino_gene_count_data %>%
ggplot(aes(x = n, y = fct_rev(disease), fill = fct_rev(genes_in_aquino))) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_manual(values = my_palette,
guide = guide_legend(reverse = TRUE)) +
labs(x = "Number of genes", y = NULL, fill = NULL) +
theme_aw +
theme(legend.position = "bottom",
panel.grid.major.x = element_line(),
panel.grid.major.y = element_blank()
)
aquino_gene_countTo answer the question whether variants in NDD genes can influence Using Chen et al multi-ancestry phenotype QTL to explore whether peripheral blood counts are related to variation in NDD genes.
To do this, I have downloaded all associations from the locus-to-gene pipeline relating the the neurodegeneration genes above and this Chen paper via opentargets.
Note, the metaanalysis incorporating the various ancestries do not have betas associated, just p values.
Out of the 32 genes that have expression in peripheral immune cells (based on aquino), I will look at which of those genes have associations with blood cell count.
traits_to_explore <- c("White blood cell count",
"Monocyte count",
"Lymphocyte counts",
"Neutrophil count",
"Basophil count",
"Eosinophil counts",
"Platelet count",
"Mean platelet volume"
)
opentargets <- read_delim(args$chen_2020_locus_to_gene_path) %>%
dplyr::left_join(.,
gene_expressed_in_data %>%
dplyr::select(disease, gene, genes_in_aquino),
by = "gene") %>%
dplyr::filter(!gene %in% c("CTSB", "KAT8")) %>%
dplyr::select(study_id = `Study ID`,
gene,
disease,
Trait = `Reported Trait`,
N_participants = `Study N Initial`,
snp_ID = `Index Variant ID`,
snp_rs = `Index Variant RSID`,
p_value = `P-Value`,
beta = Beta,
beta_ci_low = `Beta CI Lower`,
beta_ci_high = `Beta CI Upper`,
l2g_score = `L2G Pipeline Score`,
has_sumstats = `Has sumstats`) %>%
dplyr::filter(Trait %in% traits_to_explore) %>%
dplyr::mutate(N_participants = as.numeric(N_participants))
substudy_info <- read_delim(args$chen_2020_substudy_lookup_table_path) %>%
dplyr::select(study_id = accessionId,
Trait = reportedTrait,
discoverySampleAncestry,
associationCount,
summaryStatistics) %>%
extract(discoverySampleAncestry, into = c("N_participants", "Ancestry"), regex = "^([0-9]+)\\s(.+)$", remove = TRUE) %>%
dplyr::mutate(N_participants = as.numeric(N_participants)) %>%
arrange(study_id) %>%
dplyr::select(-Trait, -N_participants)
included_chen_study_ids <- substudy_info %>%
dplyr::filter(Ancestry %in% c("African American or Afro-Caribbean, African unspecified",
"European",
"East Asian",
"Hispanic or Latin American",
"South Asian",
"African American or Afro-Caribbean, African unspecified, European, East Asian, Hispanic or Latin American, South Asian")) %>%
pull(study_id)
saveRDS(included_chen_study_ids,
file = file.path(base_dir, "/derived_data/chen_derived_data/included_chen_study_ids.RDS"))
substudy_info$Ancestry %>% unique()## [1] "African American or Afro-Caribbean, African unspecified"
## [2] "European"
## [3] "East Asian"
## [4] "Hispanic or Latin American"
## [5] "South Asian"
## [6] "African American or Afro-Caribbean, African unspecified, European, East Asian, Hispanic or Latin American, South Asian"
opentargets <- left_join(opentargets,
substudy_info,
by = c("study_id")) %>%
dplyr::relocate(study_id, Ancestry, N_participants, gene, disease, Trait, p_value, beta, beta_ci_low, beta_ci_high, everything()) %>%
dplyr::mutate(Ancestry = case_when(Ancestry == "African American or Afro-Caribbean, African unspecified, European, East Asian, Hispanic or Latin American, South Asian" ~ "Mixed",
Ancestry == "East Asian" ~ "Asian",
Ancestry == "European" ~ "European",
Ancestry == "South Asian" ~ "South Asian",
TRUE ~ "other")) %>%
dplyr::select(-summaryStatistics) %>%
arrange(Ancestry)
opentargets %>%
gt() %>%
opt_interactive(use_resizers = T,
use_filters = T
)mixed <- opentargets %>% dplyr::filter(Ancestry == "Mixed")
European <- opentargets %>% dplyr::filter(Ancestry == "European")
# Create combined category of disease and direction of beta
opentargets <- opentargets %>%
dplyr::mutate(trait_direction = case_when(beta >0 ~ str_c(Trait, " - positive β"),
beta<0 ~ str_c(Trait, " - negative β"),
TRUE ~ NA_character_))
# # Check number of NDD genes that have an association in Chen in white cell count metrics
# opentargets %>%
# dplyr::filter(Trait %in% c("Monocyte count",
# "Lymphocyte counts",
# "Neutrophil count") ) %>% #,
# # Ancestry == "European") %>%
# dplyr::select(gene) %>% unique()
chen_association_count <- opentargets %>%
dplyr::filter(Trait %in% c("White blood cell count",
"Monocyte count",
"Lymphocyte counts",
"Neutrophil count")) %>%
ggplot(aes(x = gene, fill = trait_direction)) +
geom_bar(stat = "count") +
facet_nested( Trait + Ancestry ~ disease,
scales = "free",
space = "free") +
scale_fill_manual(values = my_palette)+
scale_y_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1)) +
theme_aw +
theme(legend.position = "right",
panel.grid.minor.y = element_blank()) +
labs(fill = "Trait direction", y = "Count")
ggsave(chen_association_count,
height = 34, width = 32, units = "cm",
file = file.path(base_dir, "figures/chen_wbc_mixed_association_counts.pdf"))
ggsave(chen_association_count,
height = 34, width = 32, units = "cm",
file = file.path(base_dir, "figures/chen_wbc_mixed_association_counts.png"))
chen_association_count # Create a tibble for genes by trait in Chen study
genes_in_chen <- European %>%
count(gene, Trait) %>%
dplyr::mutate(chen_assoc = "Blood count association")
# Defining the traits
traits <- c("White blood cell count", "Monocyte count", "Lymphocyte counts", "Neutrophil count")
# Create a data frame with all combinations of genes and traits
all_combinations <- expand.grid(gene = summary_genes$gene, Trait = traits)
# Join the all_combinations with the gene_expressed_in_chen table
expanded_table <- all_combinations %>%
dplyr::left_join(genes_in_chen, by = c("gene", "Trait")) %>%
dplyr::left_join(summary_genes, by = "gene") %>%
dplyr::mutate(genes_in_chen_relationship = ifelse(is.na(chen_assoc), "No association", chen_assoc))
chen_gene_count_data <- expanded_table %>%
group_by(disease) %>%
mutate(Trait = str_wrap(Trait, width = 16)) %>%
count(disease, Trait, genes_in_chen_relationship) %>%
dplyr::group_by(disease, Trait) %>%
dplyr::summarise(
total = sum(n), # Sum of all entries for each group
`Blood count association` = sum(n[genes_in_chen_relationship == "Blood count association"]),
`No association` = sum(n[genes_in_chen_relationship == "No association"])
) %>%
dplyr::mutate(
percent_blood_count_association = `Blood count association` / total * 100,
percent_no_association = `No association` / total * 100
)
chen_gene_count_data %>% gt()| Trait | total | Blood count association | No association | percent_blood_count_association | percent_no_association |
|---|---|---|---|---|---|
| AD | |||||
| Lymphocyte counts | 18 | 11 | 7 | 61.11111 | 38.88889 |
| Monocyte count | 18 | 10 | 8 | 55.55556 | 44.44444 |
| Neutrophil count | 18 | 12 | 6 | 66.66667 | 33.33333 |
| White blood cell count | 18 | 12 | 6 | 66.66667 | 33.33333 |
| FTD | |||||
| Lymphocyte counts | 6 | 4 | 2 | 66.66667 | 33.33333 |
| Monocyte count | 6 | 3 | 3 | 50.00000 | 50.00000 |
| Neutrophil count | 6 | 3 | 3 | 50.00000 | 50.00000 |
| White blood cell count | 6 | 5 | 1 | 83.33333 | 16.66667 |
| PD | |||||
| Lymphocyte counts | 15 | 4 | 11 | 26.66667 | 73.33333 |
| Monocyte count | 15 | 3 | 12 | 20.00000 | 80.00000 |
| Neutrophil count | 15 | 4 | 11 | 26.66667 | 73.33333 |
| White blood cell count | 15 | 5 | 10 | 33.33333 | 66.66667 |
chen_gene_count <- chen_gene_count_data %>%
dplyr::select(disease, Trait, `Blood count association`, `No association`) %>%
pivot_longer(cols = c("Blood count association", "No association"), names_to = "association", values_to = "count") %>%
ggplot(aes(x = disease, y = count, fill = fct_rev(association))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = my_palette) +
labs(x = NULL, y = "Proportion of genes", fill = NULL) +
theme_aw +
theme(legend.position = "bottom") +
facet_grid(~ Trait)
chen_gene_countThis section explores the pattern of expression of NDD genes in peripheral immune cells. To identify which peripheral cell has the greatest expression, the results were normalised to the cell type and activation state with the greatest expression per gene. The figure shows the activation state with the maximum proportion per cell type.
table_3c <- read_delim(str_c(args$aquino_eqtl_dir_path, "/aquino_supp_table_S3c.csv"),
skip = 3,
col_names = T)
# Update column names by combining first and second row
new_col_names <- paste(table_3c[1, ], table_3c[2, ], sep = "_")
names(table_3c) <- c("GeneID", "Symbol", new_col_names[-c(1,2)]) # Skip first two as they are GeneID and Symbol
# Remove the first two rows as they are now redundant
table_3c <- table_3c[-c(1, 2), ]
aquino_cell_order <- tibble(Cell = c("cDC",
"pDC",
"MONO CD14",
"MONO CD16",
"NK CD56brt",
"NK CD56dim",
"NK MEM",
"ILC",
"T CD4 E",
"T CD4 N",
"Reg T",
"T CD8 CM/EM",
"T CD8 EMRA",
"T CD8 N",
"MAIT",
"γδ T",
"BMK",
"BML",
"BNK",
"BNL",
"Plasmablast"),
Lineage = c("Dendritic",
"Dendritic",
"Monocyte",
"Monocyte",
"NK",
"NK",
"NK",
"NK",
"CD4",
"CD4",
"CD4",
"CD8",
"CD8",
"CD8",
"CD8",
"CD8",
"B",
"B",
"B",
"B",
"B")
)
# Convert the data to a long format
long_data <- table_3c %>%
pivot_longer(
cols = -c(GeneID, Symbol), # Excluding GeneID and Symbol columns from the pivot
names_to = c("Cell", "virus"),
names_sep = "_", # Splitting based on the separator used in new column names
values_to = "log2TPM"
) %>%
dplyr::mutate(log2TPM = as.numeric(log2TPM),
TPM = 2^log2TPM) # Reversing the log2 transformation to get the original TPM values
filtered_data <- long_data %>%
dplyr::filter(Symbol %in% summary_gene_list) %>%
dplyr::left_join(.,
summary_genes,
by = c("Symbol"="gene")) %>%
dplyr::left_join(.,
aquino_cell_order,
by = "Cell") %>%
dplyr::mutate(#log2TPM = as.numeric(log2TPM),
virus = fct_relevel(virus, c("NS", "IAV", "COV")),
Cell = fct_relevel(Cell, aquino_cell_order$Cell),
Lineage = fct_relevel(Lineage, aquino_cell_order$Lineage %>% unique())) %>%
group_by(Symbol) %>%
mutate(
# TPM = 2^log2TPM, # Reversing the log2 transformation to get the original TPM values
proportion_expression_per_gene = TPM / max(TPM)) %>% # Calculating the proportion to allow comparison within a gene
ungroup()
# Function to wrap text at 20 characters
wrap_text <- function(x) {
sapply(x, function(y) str_wrap(y, width = 8)) # Adjust width as needed
}
# Legend width paremeter
legend_width <- unit(1.5, "lines")
max_proportion_per_treatment_data <- filtered_data %>%
as_tibble() %>%
dplyr::group_by(Symbol, Cell) %>%
dplyr::slice_max(proportion_expression_per_gene, n = 1, with_ties = FALSE) %>% # Select max proportional value out of the NS, COV and IAV treatments for each celltype and gene
ungroup()
dendritic_cell_genes <- c("APP", "PSEN2", "PARK7")
# Find ratio of values between CD14 monocytes and CD56dim NK cells (for whichever treatment), to explore the pattern between different genes
mono_nk_ratio <- max_proportion_per_treatment_data %>%
dplyr::filter(Cell %in% c("MONO CD14", "NK CD56dim")) %>%
dplyr::select(-Lineage, -TPM, -log2TPM, -virus) %>%
pivot_wider(names_from = Cell,
values_from = proportion_expression_per_gene) %>%
dplyr::mutate(log_mono_nk_ratio = log10(`MONO CD14` / `NK CD56dim`)) %>%
arrange(log_mono_nk_ratio) %>%
dplyr::select(Symbol, disease, log_mono_nk_ratio, cd14_mono_prop = `MONO CD14`, cd56dim_nk_prop = `NK CD56dim`)
max_proportion_per_treatment_data <- max_proportion_per_treatment_data %>%
left_join(.,
mono_nk_ratio,
by = c("Symbol", "disease")) %>%
dplyr::mutate(Symbol = fct_reorder(Symbol, -cd14_mono_prop), # Reorder factors by descencing CD14 Monocyte proportion values
Symbol = fct_relevel(Symbol, dendritic_cell_genes, after = 0))
cell_factor_levels <- levels(max_proportion_per_treatment_data$Symbol)
aquino_cell_expression_plot_max <- max_proportion_per_treatment_data %>%
ggplot(aes(x = Cell, y = fct_rev(Symbol), fill = proportion_expression_per_gene)) +
geom_tile() +
facet_nested(disease ~ Lineage,
scales = "free",
space = "free",
labeller = labeller(Lineage = wrap_text),
) + # Applying wrap_text to Lineage labels
theme_aw +
theme(legend.key.size = legend_width,
legend.position = "bottom",
strip.background.x = element_rect(color="black", linewidth=1, linetype="solid"),
legend.title = element_text(vjust = 0.7)) + # Center-align legend title
scale_fill_viridis_c() +
labs(title = "Proportion of gene expression per gene",
subtitle = "Maximal proportion over treatments, ordered by CD14-Mono and DC",
fill = "Proportion expression",
x = NULL,
y = "Gene")
aquino_cell_expression_plot_maxTo identify the specificity of gene expression by cell type and activation state, a specificity metric tau is used. The metric ranges from 0-1, with a higher value indicating greater specificity.
# Tau across treatment
max_proportion_per_treatment_data <- as_tibble( max_proportion_per_treatment_data) %>%
mutate(Cell = as.character(Cell))
# Tau by celltypes
wide_data <- max_proportion_per_treatment_data %>%
dplyr::select(-virus, -log2TPM, -log_mono_nk_ratio, -cd14_mono_prop, -cd56dim_nk_prop, -TPM, -Lineage) %>%
pivot_wider(names_from = Cell, values_from = proportion_expression_per_gene)
# Tau by celltypes
wide_data <- max_proportion_per_treatment_data %>%
dplyr::select(-virus, -log2TPM, -log_mono_nk_ratio, -cd14_mono_prop, -cd56dim_nk_prop, -TPM, -Lineage) %>%
tidyr::pivot_wider(names_from = Cell, values_from = proportion_expression_per_gene)
tau_results <- wide_data %>%
dplyr::rowwise() %>%
dplyr::mutate(
tau = {
x <- dplyr::c_across(where(is.numeric)) # This ensures only numeric columns, typically the cell data, are used.
x_hat <- x / max(x, na.rm = TRUE)
tau_value <- sum(1 - x_hat, na.rm = TRUE) / (length(x) - 1)
tau_value
}
) %>%
dplyr::ungroup() %>%
dplyr::select(GeneID, Symbol, tau_per_cell = tau)
# Finding the cell with the maximum proportion_expression_per_gene for each gene
max_cell_per_gene <- max_proportion_per_treatment_data %>%
as_tibble() %>%
dplyr::group_by(GeneID, Symbol) %>% # Group by GeneID and Symbol to handle multiple genes
dplyr::summarise(
Max_Cell = Cell[which.max(proportion_expression_per_gene)] # Get the cell with the maximum proportion
) %>%
ungroup() # Ungroup for further operations if necessary
cell_tau <- left_join(tau_results,
max_cell_per_gene,
by = c("GeneID", "Symbol")) %>%
dplyr::mutate(Lineage = "Cell type") %>%
dplyr::left_join(.,
max_proportion_per_treatment_data %>%
dplyr::select(Symbol, disease),
by = "Symbol") %>%
dplyr::mutate(Symbol = fct_relevel(Symbol, cell_factor_levels)) %>%
group_by(Symbol) %>%
slice(1) %>%
ungroup()
cell_tau_plot <- cell_tau %>%
ggplot(aes(x = "Max Cell, τ", y = fct_rev(Symbol), fill = tau_per_cell)) +
geom_tile() + # Tiles for heatmap
geom_text(aes(label = Max_Cell), color = "white", size = 3, vjust = 0.4) + # Add labels for Max_Cell
scale_fill_gradient(low = "grey90", high = "darkslategrey", limits = c(0, 1)) + # Gradient fill
facet_nested(disease ~ Lineage, scales = "free", space = "free") +
labs(x = NULL, y = NULL, fill = "τ") + # Remove x-axis title, adjust legend title
theme_minimal(base_family = "Helvetica", base_size = 14) +
theme(
axis.title.y = element_blank(), # Remove y-axis title
axis.text.y = element_blank(), # Remove y-axis text if unnecessary
axis.ticks.y = element_blank(), # Remove y-axis ticks
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
panel.spacing = unit(0, "lines"), # Remove space between panels
strip.background.x = element_rect(fill = NA, colour = "black", linewidth = 1), # Black box around facet headings
plot.margin = unit(c(1, 1, 1, 1), "points"), # Adjust plot margins
legend.position = "bottom",
legend.title = element_text(vjust = 0.7),
legend.key.size = legend_width
)
cell_tau %>% gt() %>%
opt_interactive()# Find number of cell-types that contain the maximal gene count
cell_tau %>%
count(Max_Cell) %>%
dplyr::mutate(percentage = n/34*100) %>%
gt() %>%
opt_interactive()dc_mono_values <- cell_tau %>% dplyr::filter(Max_Cell %in% c("pDC", "cDC", "MONO CD14", "MONO CD16")) %>%
dplyr::select(tau_per_cell) %>% unlist()
NK_values <- cell_tau %>% dplyr::filter(Max_Cell %in% c("NK CD56brt", "NK CD56dim")) %>%
dplyr::select(tau_per_cell) %>% unlist()
# Calculate mean and standard deviation for the first group
dc_mono_stats <- cell_tau %>%
dplyr::filter(Max_Cell %in% c("pDC", "cDC", "MONO CD14", "MONO CD16")) %>%
summarise(
mean_tau_per_cell = mean(tau_per_cell, na.rm = TRUE),
sd_tau_per_cell = sd(tau_per_cell, na.rm = TRUE)
)
# Calculate mean and standard deviation for the second group
NK_stats <- cell_tau %>%
filter(Max_Cell %in% c("NK CD56brt", "NK CD56dim")) %>%
summarise(
mean_tau_per_cell = mean(tau_per_cell, na.rm = TRUE),
sd_tau_per_cell = sd(tau_per_cell, na.rm = TRUE)
)
# Compare the two groups using a t-test
t_test_results <- t.test(x = dc_mono_values,
y = NK_values,
alternative = "two.sided",
var.equal = F)
t_test_results##
## Welch Two Sample t-test
##
## data: dc_mono_values and NK_values
## t = 2.9989, df = 12.77, p-value = 0.01044
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05330697 0.32980859
## sample estimates:
## mean of x mean of y
## 0.8234846 0.6319268
expression_tau_plot <- (aquino_cell_expression_plot_max + labs(title = NULL, subtitle = NULL) |
cell_tau_plot + theme(strip.text.y = element_blank()) |
activationState_tau_plot + theme(axis.text.y = element_text(),
)) +
plot_layout(widths = c(8,1,1), guides = "collect") &
theme(legend.position = "bottom")
expression_tau_plottable_4b <- read_delim(str_c(args$aquino_eqtl_dir_path, "/aquino_supp_table_S4b.csv"),
skip = 7,
col_names = T) %>%
dplyr::filter(Symbol %in% summary_gene_list,
FDR_lineage_adj <=0.05) %>%
# popDE_lineage_adj == "yes") %>%
dplyr::select(-beta_raw, -FDR_raw, -popDE_raw, -t_delta_beta, -popDE_lineage_adj) %>%
left_join(.,
summary_genes,
by = c("Symbol" = "gene")) %>%
relocate(Lineage, disease)
table_4b_condensed <- table_4b %>%
group_by(Lineage, Symbol) %>%
summarise(
max_beta_lineage_adj_per_Condition = beta_lineage_adj[which.max(abs(beta_lineage_adj))],
disease = disease[which.max(abs(beta_lineage_adj))],
Condition = Condition[which.max(abs(beta_lineage_adj))],
.groups = 'drop' # Drops the grouping structure after summarizing
)
euro_vs_african_expression_plot <- table_4b_condensed %>%
ggplot(aes(x = fct_relevel(Lineage, aquino_cell_order$Lineage),
y = fct_rev(Symbol),
fill = max_beta_lineage_adj_per_Condition)) +
geom_tile() +
facet_grid(disease ~ .,
scales = "free",
space = "free") +
scale_fill_gradient2(low = "darkblue", high = "darkred", mid = "white",
midpoint = 0, # This sets the midpoint at 0
limit = c(min(table_4b$beta_lineage_adj, na.rm = TRUE),
max(table_4b$beta_lineage_adj, na.rm = TRUE)),
space = "Lab",
name = "Log2 fold change") +
scale_x_discrete(labels = plot_labeller) +
theme_aw +
theme(legend.key.size = legend_width) +
labs(fill = "Log2fold change",
x = NULL, #"Cell lineage",
y = "Gene")
euro_vs_african_expression_plot +
labs(title = "DGE of cell lineages by ancestry",
subtitle = "European relative to African")fig1_gene_expression_chen <-
(
(aquino_gene_count + theme_aw + theme(legend.direction = "vertical",
legend.title = element_text(vjust = 1)) |
chen_gene_count +theme_aw + theme(legend.position = "bottom",
legend.title = element_text(vjust = 1)) |
euro_vs_african_expression_plot + theme_aw + theme(legend.position = "bottom",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1))
)+
plot_layout(guides = "keep", widths = c(1,4,1.5))
) /
(
(aquino_cell_expression_plot_max + theme_aw +
theme(legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1)) +
labs(title = NULL, subtitle = NULL) |
cell_tau_plot + theme_aw +theme(strip.text.y = element_blank(),
legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1)) |
activationState_tau_plot + labs(x = NULL) + theme_aw + theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1)
)
) +
plot_layout(widths = c(10,1.6,1.6), guides = "collect")
) +
plot_layout(heights = c(3,8)) +
plot_annotation(tag_levels = list(c("a", "b", "c", "d", "e", "f")
)
)
fig1_gene_expression_chenthesis_fig_3 <- (aquino_gene_count + theme_aw + theme(legend.direction = "vertical",
legend.title = element_text(vjust = 1)) |
enrichment_plot) +
plot_annotation(tag_levels = "a")
ggsave(thesis_fig_3,
file = file.path(base_dir, "figures/thesis3_aquino_gene_count.pdf"),
width = 18, height = 11, units = "cm")thesis_figure_chen <- (chen_gene_count +
theme_aw +
theme(legend.direction = "vertical",
legend.position = "right",
legend.title = element_text(vjust = 1))) /
chen_association_count +
plot_annotation(tag_levels = "a") +
plot_layout(heights = c(1,4))
ggsave(thesis_figure_chen,
file = file.path(base_dir, "figures/thesis_chen_associations.png"),
width = 38, height = 45, units = "cm")
thesis_figure_chenthesis_expression_tau_plot <- (aquino_cell_expression_plot_max + theme_aw +
theme(legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1)) +
labs(title = NULL, subtitle = NULL) |
cell_tau_plot + theme_aw +theme(strip.text.y = element_blank(),
legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1)) |
activationState_tau_plot + labs(x = NULL) + theme_aw + theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.key.width = unit(0.8, "cm"),
legend.title = element_text(vjust = 1))
) + plot_layout(widths = c(8,1,1), guides = "collect") +
plot_annotation(tag_levels = "a") &
theme(legend.position = "bottom")
thesis_expression_tau_plotggsave(thesis_expression_tau_plot,
file = file.path(base_dir, "figures/thesis_expression_tau.png"),
units = "cm", width = 36, height = 30)ggsave(euro_vs_african_expression_plot,
file = file.path(base_dir, "figures/thesis_eur_afr_dge.png"),
units = "cm", width = 12, height = 16)## R version 4.2.0 (2022-04-22)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 14.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] httr_1.4.7 gt_0.11.0 forcats_1.0.0 stringr_1.5.1
## [5] dplyr_1.1.0 purrr_1.0.1 readr_2.1.5 tidyr_1.3.1
## [9] tibble_3.2.0 tidyverse_1.3.2 scales_1.3.0 readxl_1.4.1
## [13] ggh4x_0.2.8 ggplot2_3.4.2 viridis_0.6.2 viridisLite_0.4.1
## [17] wesanderson_0.3.6 paletteer_1.4.1 sciRmdTheme_0.1 patchwork_1.2.0
## [21] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] fs_1.6.4 reactable_0.4.4 lubridate_1.8.0
## [4] bit64_4.0.5 rprojroot_2.0.3 tools_4.2.0
## [7] backports_1.4.1 bslib_0.3.1 utf8_1.2.2
## [10] R6_2.5.1 DBI_1.2.2 colorspace_2.1-0
## [13] withr_2.5.0 tidyselect_1.2.1 gridExtra_2.3
## [16] processx_3.8.4 bit_4.0.5 compiler_4.2.0
## [19] textshaping_0.3.6 cli_3.6.3 rvest_1.0.4
## [22] xml2_1.3.6 labeling_0.4.3 bookdown_0.29
## [25] sass_0.4.1 systemfonts_1.0.6 digest_0.6.35
## [28] rmarkdown_2.14 pkgconfig_2.0.3 htmltools_0.5.7
## [31] highr_0.9 dbplyr_2.2.1 fastmap_1.1.1
## [34] htmlwidgets_1.6.4 rlang_1.1.1 rstudioapi_0.13
## [37] farver_2.1.1 jquerylib_0.1.4 generics_0.1.3
## [40] jsonlite_1.8.8 crosstalk_1.2.1 vroom_1.6.5
## [43] googlesheets4_1.1.1 magrittr_2.0.3 Rcpp_1.0.12
## [46] munsell_0.5.1 fansi_1.0.3 lifecycle_1.0.3
## [49] stringi_1.7.6 yaml_2.3.5 grid_4.2.0
## [52] parallel_4.2.0 promises_1.3.0 crayon_1.5.2
## [55] haven_2.5.4 chromote_0.3.1 hms_1.1.2
## [58] knitr_1.40 ps_1.7.6 pillar_1.9.0
## [61] reprex_2.1.0 glue_1.6.2 evaluate_0.23
## [64] modelr_0.1.11 vctrs_0.6.5 tzdb_0.4.0
## [67] cellranger_1.1.0 gtable_0.3.1 reactR_0.6.0
## [70] rematch2_2.1.2 assertthat_0.2.1 xfun_0.34
## [73] broom_1.0.5 later_1.3.2 ragg_1.2.5
## [76] googledrive_2.1.0 gargle_1.5.0 websocket_1.4.1
## [79] ellipsis_0.3.2